import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.simplefilter('ignore')
# --- Load the Boston housing dataset and wrap it in a DataFrame ---
# NOTE(review): sklearn.datasets.load_boston was deprecated in scikit-learn 1.0
# and removed in 1.2; this script requires scikit-learn < 1.2 to run as-is.
from sklearn.datasets import load_boston
boston_dataset = load_boston()
type(boston_dataset)  # sklearn Bunch (dict-like container)
boston_dataset
dir(boston_dataset)   # list the Bunch's attributes
print(boston_dataset.DESCR)          # dataset description text
print(boston_dataset.data)           # raw feature matrix
print(boston_dataset.feature_names)  # column names for the feature matrix
print(boston_dataset.filename)       # path of the underlying data file
print(boston_dataset.target)         # house prices (the regression target)
# Build a DataFrame with one named column per feature
data=pd.DataFrame(data = boston_dataset.data, columns = boston_dataset.feature_names)
data
# Append the target as a PRICE column so features and target live in one table
data['PRICE'] = boston_dataset.target
data
data.head()
data.tail()
data.count()           # non-null count per column
pd.isnull(data)
pd.isnull(data).any()  # confirm no column has missing values
data.info()
# plotting...visualisation
# Distribution of house prices, with progressively more detailed histograms
plt.hist(data['PRICE'])
plt.show()
plt.hist(data['PRICE'], bins = 30)  # finer binning than the default
plt.show()
plt.figure(figsize =(18, 8))
plt.hist(data['PRICE'], bins = 30, ec = 'black')  # ec = edge colour of the bars
plt.xlabel('Price in 1000\'s of Dollars', fontsize = 20)
plt.ylabel('Nr of Houses', fontsize = 20)
plt.show()
data['PRICE'].mean()
data['PRICE'].median()
# mean and median are different.....so that means the data is not normally distributed
# what is a normal distribution
# seaborn............
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14.
# histplot/kdeplot are the supported replacements producing the same visuals:
# distplot's default was a density-normalised histogram with a KDE overlay.
sns.histplot(data['PRICE'], kde=True, stat='density')
plt.show()
# Histogram only: red bars with black edges, semi-transparent
plt.figure(figsize =(18, 8))
sns.histplot(data['PRICE'], color='red', alpha=0.3, edgecolor='black')
plt.show()
# Histogram + KDE overlay on the same axes
plt.figure(figsize =(18, 8))
sns.histplot(data['PRICE'], kde=True, stat='density', color='red', alpha=0.3, edgecolor='black')
plt.show()
# Histogram-only plot repeated (kept to mirror the original sequence of cells)
plt.figure(figsize =(18, 8))
sns.histplot(data['PRICE'], color='red', alpha=0.3, edgecolor='black')
plt.show()
# KDE curve only, no histogram
plt.figure(figsize =(18, 8))
sns.kdeplot(data['PRICE'], color='red')
plt.show()
data['RM'].mean()
data['RM'].median()
# Distribution of average rooms per dwelling
plt.figure(figsize =(18, 8))
sns.histplot(data['RM'], color='black', alpha=0.3, edgecolor='brown')
plt.show()
# CHAS — river-adjacency indicator (per the CHAS/PRICE correlation comment below)
data['CHAS']
data['CHAS'].value_counts()  # frequency of each distinct value
data['CHAS'].values          # underlying numpy array
data['CHAS'].index
type(data['CHAS'])           # pandas Series
# RAD — index of accessibility to highways (discrete-valued feature)
plt.figure(figsize =(10,6))
plt.hist(data['RAD'], ec = 'black', color = 'navy', alpha = 0.5)
plt.xlabel('Accessibility to Highways')
plt.ylabel('Nr of Houses')
plt.show()
# Alternative view: explicit counts per RAD value, drawn as a bar chart
accessibility=data['RAD'].value_counts()
accessibility
type(accessibility)          # Series mapping RAD value -> count
accessibility.index
type(accessibility.index)
accessibility.values
type(accessibility.values)
plt.figure(figsize =(10,6))
plt.bar(accessibility.index, accessibility.values, ec = 'black', width = 0.5)
plt.show()
# Column-wise summary statistics for the whole table
data.min()
data.max()
data.median()
data.mean()
data.describe()
# why plotting.......
# FIX: the paragraph below was pasted as bare prose (a SyntaxError); it is now a comment.
# Anscombe's quartet comprises four data sets that have nearly identical simple
# descriptive statistics, yet have very different distributions and appear very
# different when graphed. Each dataset consists of eleven (x, y) points. They were
# constructed in 1973 by the statistician Francis Anscombe to demonstrate both the
# importance of graphing data before analyzing it and the effect of outliers and
# other influential observations on statistical properties.
# Correlation
# Pairwise (Pearson, the pandas default) correlations between features and the target
data['PRICE'].corr(data['RM'])
plt.figure(figsize = (18, 12))
plt.scatter(data['PRICE'], data['RM'], color = 'red', alpha = 0.4)
plt.show()
data['PRICE'].corr(data['PTRATIO'])
# why is this negative?
# I will give you 10 minutes for the answer....
plt.figure(figsize = (18, 12))
plt.scatter( data['PTRATIO'],data['PRICE'], color = 'navy', alpha = 0.4)
plt.show()
data['NOX'].corr(data['PRICE'])
plt.figure(figsize = (18, 12))
plt.scatter( data['NOX'],data['PRICE'], color = 'black', alpha = 0.4)
plt.show()
data['CHAS'].corr(data['PRICE']) # it means the houses close to the river banks are priced marginally higher
# Full correlation matrix over all columns (features + PRICE)
data.corr()
# Heat map of the full correlation matrix (both triangles shown, mask disabled)
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), annot = True, annot_kws = {'size':14}, fmt = '.1g', mask=None)
plt.show()
# mask : boolean array or DataFrame, optional
# If passed, data will not be shown in cells where ``mask`` is True.
# Cells with missing values are automatically masked.
data.corr().shape
# Build an upper-triangle mask so the symmetric matrix shows each pair only once.
# FIX: seaborn documents ``mask`` as a boolean array; the original built a float
# array (True assigned into float64 becomes 1.0), so create it as bool directly.
masker = np.zeros_like(data.corr(), dtype=bool)
masker.shape
masker
triangle_indices=np.triu_indices_from(masker)  # indices of the upper triangle (incl. diagonal)
masker[triangle_indices] = True                # True cells are hidden by the heatmap
masker
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), annot = True, annot_kws = {'size':14}, fmt = '.1g', mask=masker, linewidths = 0.3)
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
# Home work....
# why is there a -ve correlation between the PRICE and LSTAT
# why is there a +ve correlation between the TAX and NOX
# why is there a -ve correlation between the DIS and NOX
# why is there a -ve correlation between the PRICE and LSTAT
# LSTAT % lower status of the population
# Lower status....low income...not very educated....more crimes...more fights...more problems in the area...
# would you want to go and buy a house there?.....no...not at all...
# hence houses in such areas will have low demand..and hence the negative correlation...between PRICE and LSTAT
# to prove that there is a correlation between...LSTAT and CRIM let's examine the data...
# heat map....for the CORR
# there is a positive correlation between CRIM and LSTAT...that proves the point..
# why is there a +ve correlation between the TAX and NOX
# NOX.....nitrous oxide pollutant found in areas where there are industries...
# TAX.....looks to be higher in areas where there are more industries...that sounds logical..
# INDUS proportion of non-retail business acres per town?....what is that?.....is that Industrial Area or Residential Area?
# if INDUS is industrial Area...then there must be a positive correlation between INDUS and NOX....INDUS and TAX
# we have found that there is a positive correlation between INDUS and TAX...0.7
# we have found that there is a positive correlation between INDUS and NOX....0.8
# so INDUSTRIAL AREAS HAVE TO PAY HIGHER TAX, ALSO THEY ARE MORE POLLUTED..
# why is there a -ve correlation between the DIS and NOX...
# DIS weighted distances to five Boston employment centres...
# NOX nitrous oxide concentration..
# Employment centres were mostly factories those days...located in industrial areas..
# far away from the industrial area there will be less NOX...and hence the negative correlation...
# there must be a -ve correlation between DIS and TAX..?
# can we check? ...its negative -0.5....
# seaborn plotting...
# import seaborn as sns
# kind : { "scatter" | "reg" | "resid" | "kde" | "hex" }, optional
sns.set()
sns.set_style('darkgrid')
# Joint scatter of NOX vs DIS with marginal distributions on the axes
sns.jointplot(x =data['NOX'] , y =data['DIS'], color='red', alpha = 0.3, height=15, ratio=6, kind='scatter')
plt.show()
sns.set()
sns.set_style('white')
# lmplot draws the scatter plus a fitted regression line
sns.lmplot(x = 'TAX', y = 'RAD', data = data, height=10, aspect=2)
plt.show()
sns.set()
sns.set_style('white')
sns.lmplot(x = 'RM', y = 'PRICE', data = data, height=10, aspect=2)
plt.show()
sns.set()
sns.set_style('whitegrid')
# ci = None suppresses the confidence band around the regression line
sns.lmplot(x = 'RM', y = 'PRICE', data = data, height=10, aspect=2, ci = None)
plt.show()
# Multivariable regression...
data
# Separate the target (PRICE) from the 13 explanatory features
prices = data['PRICE']
prices
features = data.drop('PRICE', axis =1)
features
# split this data into train and test...
# train...x_train and y_train...x_train will hold 80% of the features...y_train will hold 80% of the prices
# test....x_test and y_test....x_test will hold 20% of the features...y_test will hold 20% of the prices
# from sklearn.model_selection import train_test_split
# random_state is fixed so the split (and all results below) are reproducible
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 10)
X_train
y_train
X_test
y_test
len(features)
len(X_train)
round(len(X_train)/len(features),2)*100 # train_size
round(len(X_test)/len(features),2)*100 # test_size
regr = LinearRegression() # training the model
regr.fit(X_train, y_train)
regr.coef_       # one fitted slope per feature
regr.intercept_
pd.DataFrame(data =regr.coef_ , index = X_train.columns, columns = ['Coef'])
regr.score(X_train, y_train)  # R^2 on the training split
regr.score(X_test, y_test)    # R^2 on the held-out split
data['PRICE']
plt.figure(figsize= (16,8))
plt.hist(data['PRICE'], bins = 50)
plt.show()
data['PRICE'].skew()  # quantify the asymmetry seen in the histogram
# Data Transformation...
# Small worked example: log differences correspond to ratios, not absolute gaps
A = 90
B = 50
A-B
np.log(A)
np.log(B)
np.log(A)-np.log(B)
# Log-transform the target to reduce its skew
y_log=np.log(data['PRICE'])
y_log
y_log.skew()  # compare against the raw-price skew above
plt.figure(figsize= (16,8))
plt.hist(y_log, bins = 50)
plt.show()
# Side-by-side: raw prices (left) vs log prices (right)
plt.figure(figsize= (16,8))
plt.subplot(1,2,1)
plt.hist(data['PRICE'], bins = 50, color = 'red', alpha = 0.4)
plt.subplot(1,2,2)
plt.hist(y_log, bins = 50, alpha = 0.5)
plt.show()
# do a regression after data transformation ...that means...prices will be log prices..
data
log_prices = np.log(data['PRICE'])
features = data.drop(['PRICE'], axis = 1)
log_prices
features
# Same split parameters as before, so both models are trained on identical rows
X_train, X_test, y_train, y_test =train_test_split(features, log_prices, test_size = 0.2, random_state = 10)
regr.fit(X_train, y_train)  # refit the existing model object on log prices
regr.coef_
pd.DataFrame(data=regr.coef_, index=X_train.columns, columns=['coef']) # after data transformation...
regr.intercept_
regr.score(X_train, y_train)
regr.score(X_test, y_test)
# So, data transformation has improved the scores....
# So we will keep the transformation..
# Plots with sns....
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(kde=True, stat='density') reproduces its default output.
sns.set()
plt.figure(figsize=(20,6))
sns.histplot(y_log, kde=True, stat='density')
plt.title(f'Log price: The skew is {round(y_log.skew(),2)}', fontsize = 25)# note the f string used to print the y_log.skew()
plt.xlabel('LOG PRICES ', fontsize =14, color = 'green' )
plt.show()
skew=round(data['PRICE'].skew(),2)
sns.set()
plt.figure(figsize=(20,6))
sns.histplot(data['PRICE'], kde=True, stat='density')
plt.title(f'Actual price: The skew is {skew}', fontsize = 25)
plt.xlabel( 'ACTUAL PRICES ', fontsize =14, color = 'green' )
plt.show()
# we are going to conduct further tests on the data.....
# do we need all these features ...13 of them..?
# Is there a problem of multicollinearity amongst the features...check VIF....?
# Are all these features really related to the price..target....check for p values???
# hypothesis testing...
# EXP HYP:-...... Varma is in love with Rani.....
# NULL HYP:-....Varma is not in love with Rani....
# GET DATA:-.... based on the null hypothesis...Varma is not in love with Rani.....
# then we are calculating the Prob...of what?... Varma is not in love with Rani....
# if P(Varma is not in love with Rani) > 0.05....null hyp may be possible...and Varma might not be in love with Rani..
# if P(Varma is not in love with Rani) < 0.05 ...then null is not possible..So maybe Varma is in love with Rani...
# sklearn.....
# statsmodels....will allow me to do a lot of tests..also do the regression..
# First fit WITHOUT a constant — deliberately wrong, to show the effect (see next comment)
results=sm.OLS(y_train, X_train).fit()
# we have not added the constant to the exog...so we did not get the constant..intercept and also got wrong values of the coef..
results.params
X_train
sm.add_constant(X_train)  # adds an intercept column of 1s to the design matrix
results=sm.OLS(y_train, sm.add_constant(X_train)).fit() # used statsmodels...SM
results.params
round(results.pvalues,3)  # p-value per coefficient, for feature-significance checks
pd.DataFrame({'coef': results.params, 'p-value': round(results.pvalues, 3)})
# Multicollinearity....
# VIF....if VIF > 10...there is multicollinearity...then we might have to drop the feature...just might have to...
# FIX: the loop body below was unindented (a SyntaxError), and sm.add_constant(X_train)
# was rebuilt on every call — compute the augmented design matrix once and reuse it.
X_const = sm.add_constant(X_train)
variance_inflation_factor(exog = X_const.values, exog_idx = 10)  # spot-check one feature
variance_inflation_factor(exog = X_const.values, exog_idx = 1)
# VIF for every column (including the constant)
for i in range(len(X_const.columns)):
    print(variance_inflation_factor(exog = X_const.values, exog_idx = i), '\n')
# INDUS and AGE have p values greater than 0.05
# VIF looks okay....
# Bayesian information criterion.....BIC
# BIC?....Deals with what?....Deals with overfitting or overlearning...
# Lower BIC = better trade-off between goodness of fit and model complexity
results.bic
# Build our models.....
# All features and the log prices....Model 1
# without INDUS and the log prices....Model 2
# Without INDUS and AGE with log prices...Model 3
# we will look at the BIC of all these models...
# we will look at the r square of all these models..
# Model 1: every feature plus the intercept
X_incl_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const)
results_1 = model.fit()
results_1.params
results_1.bic
results_1.rsquared
X_incl_const
# Model 2: drop INDUS (high p-value noted above)
X_incl_const = X_incl_const.drop(['INDUS'], axis =1)
model = sm.OLS(y_train, X_incl_const)
results_2 = model.fit()
results_2.bic
results_2.rsquared
# Model 3: additionally drop AGE
X_incl_const = X_incl_const.drop(['AGE'], axis =1) # already removed INDUS
model = sm.OLS(y_train, X_incl_const)
results_3 = model.fit()
results_3.bic
results_3.rsquared
# So what?....the model to go with is model...3...with the least BIC...